In [1]:
import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.graph_objects as go
import seaborn as sns
from matplotlib import pyplot as plt
from plotly.subplots import make_subplots
In [2]:
# Location of the diamonds CSV. Set the DIAMONDS_CSV environment variable to point at
# your own copy of the file; when it is unset, the original local path is used, so the
# notebook behaves exactly as before on the author's machine.
DIAMONDS_CSV = os.environ.get(
    'DIAMONDS_CSV',
    'C:/Users/jsree/OneDrive/Documents/Python projects/Kaggle/Diamond Price/diamonds.csv',
)
# NOTE(review): the first CSV column appears to be a saved row index (it loads as
# 'Unnamed: 0' and runs 1..n) -- confirm before using it as a feature.
data = pd.read_csv(DIAMONDS_CSV)
In [3]:
# Preview the first rows to confirm the columns were parsed as expected.
data.head()
Out[3]:
Unnamed: 0 carat cut color clarity depth table price x y z
0 1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
1 2 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
2 3 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
3 4 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
4 5 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
In [4]:
# Report the dataset size: one row per diamond, one column per attribute.
n_diamonds, n_columns = data.shape
print(f'Data contain {n_diamonds} diamonds and {n_columns} columns.')
Data contain 53940 diamonds and 11 columns.
In [5]:
# Column dtypes: shows which columns loaded as numbers and which as object (text labels).
data.dtypes
Out[5]:
Unnamed: 0      int64
carat         float64
cut            object
color          object
clarity        object
depth         float64
table         float64
price           int64
x             float64
y             float64
z             float64
dtype: object
In [6]:
# Summary statistics of the numeric columns, transposed so each column is one row.
# NOTE(review): x, y and z show a minimum of 0.0 in this output, which looks like
# missing/invalid measurements -- worth checking before any modelling.
data.describe().T
Out[6]:
count mean std min 25% 50% 75% max
Unnamed: 0 53940.0 26970.500000 15571.281097 1.0 13485.75 26970.50 40455.25 53940.00
carat 53940.0 0.797940 0.474011 0.2 0.40 0.70 1.04 5.01
depth 53940.0 61.749405 1.432621 43.0 61.00 61.80 62.50 79.00
table 53940.0 57.457184 2.234491 43.0 56.00 57.00 59.00 95.00
price 53940.0 3932.799722 3989.439738 326.0 950.00 2401.00 5324.25 18823.00
x 53940.0 5.731157 1.121761 0.0 4.71 5.70 6.54 10.74
y 53940.0 5.734526 1.142135 0.0 4.72 5.71 6.54 58.90
z 53940.0 3.538734 0.705699 0.0 2.91 3.53 4.04 31.80
In [7]:
# Number of diamonds per cut grade.
# NOTE(review): the name `count` is reassigned further down for the color counts,
# so this result is not available after that cell runs.
count = data['cut'].value_counts()
count
Out[7]:
Ideal        21551
Premium      13791
Very Good    12082
Good          4906
Fair          1610
Name: cut, dtype: int64
In [8]:
# Distribution of diamonds across cut grades (one bar per grade).
# The column is referenced by name, consistent with the other seaborn calls in this
# notebook, instead of passing the Series alongside `data`.
sns.displot(data=data, x="cut")
Out[8]:
<seaborn.axisgrid.FacetGrid at 0x233cf204370>
In [9]:
# Compare price across cut grades (bar height is barplot's default aggregate of price).
sns.barplot(
    data=data,
    x="cut",
    y="price",
)
Out[9]:
<AxesSubplot:xlabel='cut', ylabel='price'>
In [10]:
# Summary statistics for the price column on its own.
data['price'].describe()
Out[10]:
count    53940.000000
mean      3932.799722
std       3989.439738
min        326.000000
25%        950.000000
50%       2401.000000
75%       5324.250000
max      18823.000000
Name: price, dtype: float64
In [11]:
# Single-panel figure that will hold a box plot of the price distribution.
fig_6 = make_subplots(rows=1, cols=1, specs=[[{'type': 'xy'}]])
# The trace name is what labels the box on the figure, so use the column name
# rather than the placeholder 'p'.
fig_6.add_trace(go.Box(x=data['price'],
                       name='price'))
In [12]:
# Style the price box plot and render it in one fluent chain
# (each update_* call returns the figure itself).
(
    fig_6
    .update_traces(marker_color='salmon')  # box colour
    .update_layout(
        showlegend=False,
        template='simple_white',
        font={'family': 'Arial', 'size': 8, 'color': 'black'},
    )
    .show()
)

Diamond prices range from 326 to 18,823 dollars; the median price is 2,401 dollars, while the mean is higher at about 3,933 dollars because the distribution is right-skewed.

In [13]:
# Number of diamonds per color grade (this rebinds `count`, replacing the cut counts).
count = data['color'].value_counts()
count
Out[13]:
G    11292
E     9797
F     9542
H     8304
D     6775
I     5422
J     2808
Name: color, dtype: int64

Color column: diamond color grades

- D, E, F – These are the whitest diamonds.
- G, H – These stones have a very slight hue of yellow (or gray or brown).
- I, J – These stones have a light to medium hue of yellow (or gray or brown).
- K, L – These stones have a strong hue of yellow (or gray or brown).
- M, N – These stones have a very strong hue of yellow (or gray or brown).
- N – Z – Any stone which is more yellow than N and less yellow than Z is called CAPE or DARK CAPE.
- Fancy Yellow – Any stone which is more yellow than Z is called Fancy Yellow, and its value is higher.

Only grades D through J appear in this dataset.

In [14]:
# Diamonds per color grade, with each bar stacked by cut.
# The column is referenced by name, consistent with the other seaborn calls in this
# notebook, instead of passing the Series alongside `data`.
sns.displot(data=data, x="color", hue="cut", multiple="stack")
Out[14]:
<seaborn.axisgrid.FacetGrid at 0x233cf204c40>

Number of diamonds of each color, with each bar split (stacked) by type of cut.

In [15]:
# Price as a function of carat.
sns.lineplot(
    x="carat",
    y="price",
    data=data,
)
Out[15]:
<AxesSubplot:xlabel='carat', ylabel='price'>

There is a positive relationship between carat (the weight of the diamond) and price: heavier diamonds are generally more expensive.

In [ ]: